This report presents the findings on the solar data, the additional data, and the station data.
# General purpose
library(tidyverse)
library(data.table)
library(lubridate)
# Descriptive
library(skimr)
# Visualization
library(ggplot2)
library(PerformanceAnalytics)
library(corrplot)
# Mapping
library(leaflet)
library(leaflet.extras)
library(sf)
#library(gdtools)
# Calculations
library(forecast)
library(caret)
library(mice)
library(outliers)
library(foreach)
library(doParallel)
# Print each input dataset for a first look: the solar production data
# (train and test), the station data, and the additional data.
# NOTE(review): these objects are created outside this section -- confirm
# where they are loaded.
data_solar_train
data_solar_test
data_station
data_add
# Run the missing-value check on every dataset.
# NOTE(review): f_check_na() is defined elsewhere; presumably it reports the
# share of NAs per dataset -- confirm against its definition.
f_check_na(data_solar_train)
f_check_na(data_solar_test)
f_check_na(data_station)
f_check_na(data_add)
Comment: There are no missing values in the solar datasets or the station dataset. In the additional dataset, 3.6% of the values are NA; these missing values will be imputed.
# Run the variable screening on every dataset.
# NOTE(review): criteria_variables() is defined elsewhere; presumably it flags
# constant (zero-variance) variables -- confirm against its definition.
criteria_variables(data_solar_train)
criteria_variables(data_solar_test)
criteria_variables(data_station)
criteria_variables(data_add)
Comment: No constant variables have been found.
Based on: https://www.kaggle.com/rtatman/data-cleaning-challenge-outliers https://cran.r-project.org/web/packages/outliers/outliers.pdf
library(knitr)
# Disabled output: printing the outlier summary object.
# solar_data_outlier_skim
# Draw the outlier plot object.
# NOTE(review): solar_data_outlier_hist is not defined in this section;
# presumably a histogram built in an earlier step -- confirm.
plot(solar_data_outlier_hist)
# Disabled output: table rendering of the outlier rows via knitr::kable().
#kable(solar_data_outlier_table)
The data is normally distributed. Only one row could be considered an outlier.
# Summarise every dataset in two complementary ways:
#   skim()    - per-column summary statistics (from skimr)
#   glimpse() - compact listing of column types and leading values
skim(data_solar_train)
glimpse(data_solar_train)
skim(data_solar_test)
glimpse(data_solar_test)
skim(data_station)
glimpse(data_station)
skim(data_add)
glimpse(data_add)
In this step, weather stations are sorted by their respective solar energy production volume in descending order. We then create two plots: the first shows the top weather stations, and the second shows the bottom weather stations.
As shown in the graph below, the differences between stations are small.
#### 2 Rank position change over time
We are interested in finding out how the production rank of a station changes over the years.
This suggests that variables other than seasonality are at play, since seasonality alone would move every station in the same direction.
#### 3 Total production seasonality, trend and cyclical movements
Descriptive plots showing the production, recorded in millions, at different granularities of time.
This section is based on https://towardsdatascience.com/forecasting-with-r-trends-and-seasonality-def24280e71f
# Print the production map object so it is rendered in the report.
# NOTE(review): map_production is built outside this section, presumably with
# leaflet -- confirm.
map_production
## Warning: Removed 26733 rows containing non-finite values (stat_bin).
## Warning: Removed 26733 rows containing non-finite values (stat_density).
## Warning: Removed 26733 rows containing non-finite values (stat_boxplot).
## Warning: Removed 26733 rows containing non-finite values (stat_boxplot).
#### 3 Of predictors
# Reshape the first 50 solar predictors to long format (one row per
# observation/predictor pair) so that every predictor gets its own facet.
# - all_of() makes it explicit that data_solar_col_predi is an external
#   character vector, not a column of data_solar_train.
# - Only the plotted columns are selected before pivoting, instead of pivoting
#   every predictor and filtering afterwards.
# - head() (rather than [1:50]) cannot produce NA names when fewer than 50
#   predictors are available.
# Factor levels follow data_solar_col_predi so facets keep the predictor order.
data <- data_solar_train %>%
  dplyr::select(all_of(head(data_solar_col_predi, 50))) %>%
  pivot_longer(cols = everything(), names_to = "PC", values_to = "Value") %>%
  mutate(PC_f = factor(PC, levels = data_solar_col_predi))

# Density-scaled histogram with a kernel density overlay, one facet per
# predictor. after_stat(density) replaces the deprecated `..density..`
# notation; bins = 30 is the ggplot2 default, stated explicitly.
ggplot(data = data, aes(x = Value)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    colour = "black",
    fill = "white"
  ) +
  geom_density(alpha = 0.2, fill = "blue") +
  facet_wrap(vars(PC_f))

# Horizontal boxplot per predictor. A single layer carries the outlier styling:
# stacking geom_boxplot() and stat_boxplot() drew every box twice and left the
# default black outliers underneath the red, transparent ones.
ggplot(data = data, aes(x = Value)) +
  geom_boxplot(coef = 1.5, outlier.colour = "red", outlier.alpha = 0.1) +
  facet_wrap(vars(PC_f)) +
  # The y axis carries no information for a single horizontal box.
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
The computed variable importance is stored in data_solar_importance. Parallel computing was used to reduce the run time.
## $KENT
## [1] "PC1" "PC4" "PC2" "PC5" "PC6" "PC7" "PC3" "PC24" "PC9"
## [10] "PC17" "PC26" "PC32" "PC42" "PC105" "PC44" "PC27" "PC78" "PC18"
## [19] "PC52" "PC79" "PC64" "PC51" "PC36" "PC31" "PC68" "PC55" "PC12"
## [28] "PC33" "PC57" "PC47" "PC35" "PC29" "PC49" "PC63" "PC104" "PC111"
## [37] "PC8" "PC69" "PC40" "PC71" "PC37" "PC60" "PC171" "PC92" "PC76"
## [46] "PC86" "PC113" "PC87" "PC102" "PC72"
##
## $BOIS
## [1] "PC1" "PC2" "PC4" "PC5" "PC6" "PC3" "PC7" "PC24" "PC9"
## [10] "PC17" "PC26" "PC32" "PC42" "PC105" "PC78" "PC79" "PC44" "PC18"
## [19] "PC27" "PC52" "PC51" "PC64" "PC31" "PC55" "PC36" "PC57" "PC68"
## [28] "PC12" "PC47" "PC35" "PC40" "PC49" "PC63" "PC29" "PC33" "PC104"
## [37] "PC71" "PC69" "PC102" "PC99" "PC111" "PC171" "PC37" "PC19" "PC72"
## [46] "PC86" "PC76" "PC25" "PC87" "PC92"
##
## $HOOK
## [1] "PC1" "PC2" "PC4" "PC5" "PC6" "PC24" "PC3" "PC7" "PC9"
## [10] "PC32" "PC17" "PC42" "PC26" "PC79" "PC78" "PC44" "PC51" "PC64"
## [19] "PC18" "PC105" "PC55" "PC71" "PC27" "PC36" "PC52" "PC35" "PC31"
## [28] "PC47" "PC19" "PC33" "PC29" "PC57" "PC68" "PC40" "PC104" "PC12"
## [37] "PC63" "PC49" "PC69" "PC92" "PC113" "PC25" "PC37" "PC83" "PC8"
## [46] "PC102" "PC39" "PC111" "PC84" "PC90"
##
## $HOLL
## [1] "PC1" "PC2" "PC4" "PC3" "PC5" "PC7" "PC6" "PC26" "PC24"
## [10] "PC32" "PC17" "PC42" "PC35" "PC51" "PC52" "PC79" "PC55" "PC9"
## [19] "PC19" "PC27" "PC78" "PC44" "PC18" "PC8" "PC12" "PC40" "PC47"
## [28] "PC10" "PC72" "PC99" "PC64" "PC25" "PC67" "PC29" "PC102" "PC33"
## [37] "PC37" "PC63" "PC171" "PC14" "PC83" "PC71" "PC108" "PC104" "PC134"
## [46] "PC39" "PC31" "PC68" "PC191" "PC113"
##
## $GOOD
## [1] "PC1" "PC2" "PC4" "PC5" "PC3" "PC6" "PC7" "PC24" "PC9"
## [10] "PC32" "PC26" "PC17" "PC42" "PC78" "PC51" "PC79" "PC55" "PC44"
## [19] "PC27" "PC35" "PC18" "PC64" "PC71" "PC47" "PC36" "PC29" "PC31"
## [28] "PC63" "PC52" "PC68" "PC40" "PC19" "PC69" "PC57" "PC104" "PC83"
## [37] "PC49" "PC102" "PC105" "PC12" "PC33" "PC25" "PC92" "PC132" "PC171"
## [46] "PC8" "PC99" "PC87" "PC72" "PC98"
#### 2 Correlation in additional dataset
## Date Date2 Year Month Day Day_Of_Year Day_Of_Week PC1
## 1: 19940101 1994-01-01 1994 Jan 1 1 Sat -2.369062
## 2: 19940102 1994-01-02 1994 Jan 2 2 Sun -1.435331
## 3: 19940103 1994-01-03 1994 Jan 3 3 Mon -2.363953
## 4: 19940104 1994-01-04 1994 Jan 4 4 Tue -2.368490
## 5: 19940105 1994-01-05 1994 Jan 5 5 Wed -2.369270
## ---
## 6905: 20121126 2012-11-26 2012 Nov 26 331 Mon -2.265355
## 6906: 20121127 2012-11-27 2012 Nov 27 332 Tue -2.367946
## 6907: 20121128 2012-11-28 2012 Nov 28 333 Wed -2.369270
## 6908: 20121129 2012-11-29 2012 Nov 29 334 Thu -2.369270
## 6909: 20121130 2012-11-30 2012 Nov 30 335 Fri -2.123829
## PC2 PC3 PC4 PC5 PC6 PC7
## 1: -0.002715645 -0.039081932 -0.11681269 -0.07793428 0.005238273 0.02668015
## 2: -0.122355290 -1.136113557 0.46335531 0.07877000 -0.193223280 0.08952285
## 3: -0.005389969 -0.042916029 -0.11055050 -0.07829999 0.006910410 0.02582001
## 4: -0.003755845 -0.037951901 -0.11616533 -0.07832806 0.004976962 0.02614419
## 5: -0.002799016 -0.038910817 -0.11653157 -0.07782609 0.005285113 0.02680567
## ---
## 6905: 0.061912741 0.009869963 -0.03417801 -0.09844832 0.014063291 0.06799417
## 6906: -0.003054048 -0.040614706 -0.11450746 -0.07838540 0.005477429 0.02709170
## 6907: -0.002799016 -0.038910817 -0.11653157 -0.07782609 0.005285113 0.02680567
## 6908: -0.002799016 -0.038910817 -0.11653157 -0.07782609 0.005285113 0.02680567
## 6909: -0.062055146 -0.175964180 0.09712956 -0.15232627 0.108951866 0.00736406
## PC8 PC9 PC10 PC11 PC12
## 1: -0.009740713 -0.02744692 -0.050923530 -0.01284876 0.005816641
## 2: -0.118495194 -0.20340695 -0.116173954 -0.03908174 0.147646632
## 3: -0.014245311 -0.02572064 -0.051248542 -0.01263975 0.005086564
## 4: -0.009992798 -0.02859995 -0.052349350 -0.01257577 0.006177383
## 5: -0.009782939 -0.02733479 -0.051076357 -0.01277669 0.005784063
## ---
## 6905: 0.034494222 -0.06034842 -0.072793477 -0.01145415 -0.022017488
## 6906: -0.011511069 -0.02785636 -0.050781528 -0.01177372 0.006699159
## 6907: -0.009782939 -0.02733479 -0.051076357 -0.01277669 0.005784063
## 6908: -0.009782939 -0.02733479 -0.051076357 -0.01277669 0.005784063
## 6909: -0.071413849 0.04700238 -0.006426135 0.05282723 0.035597750
## PC13 PC14 PC15 PC16 PC17
## 1: -0.010622667 0.0038386506 -0.01701840 -0.0009153023 0.003378700
## 2: 0.154508125 0.2715989648 0.03970377 0.0399565588 0.037457126
## 3: -0.014928836 0.0000164203 -0.01993932 -0.0027371574 0.005323348
## 4: -0.010098870 0.0035353834 -0.01770308 -0.0005581118 0.003326476
## 5: -0.010769349 0.0045897387 -0.01712074 -0.0010876214 0.003668625
## ---
## 6905: -0.003253991 0.0356136040 0.01584012 -0.0215880848 0.014874641
## 6906: -0.010955164 0.0043218279 -0.01520286 -0.0030397077 0.004461615
## 6907: -0.010769349 0.0045897387 -0.01712074 -0.0010876214 0.003668625
## 6908: -0.010769349 0.0045897387 -0.01712074 -0.0010876214 0.003668625
## 6909: -0.011099099 0.0074048907 -0.01396803 -0.0587894384 0.040724480
## PC18 PC19 PC20 PC21
## 1: 0.002855336 -0.012090065 -0.006362850 0.010723327
## 2: -0.060915614 -0.137336140 0.056082258 0.101144044
## 3: 0.005617310 -0.012777852 -0.001671337 0.009631818
## 4: 0.002651872 -0.012458554 -0.007880411 0.011086115
## 5: 0.002830638 -0.012510888 -0.006378872 0.010688443
## ---
## 6905: 0.033656069 -0.004108262 0.016277441 0.062529624
## 6906: 0.003676820 -0.012060542 -0.004722324 0.010644114
## 6907: 0.002830638 -0.012510888 -0.006378872 0.010688443
## 6908: 0.002830638 -0.012510888 -0.006378872 0.010688443
## 6909: 0.065569791 -0.028603732 0.014814198 0.003226338
# Reshape the principal-component columns of the additional dataset (listed in
# data_add_col_pca) to long format so that every component gets its own facet.
# Factor levels follow data_add_col_pca so facets keep the component order.
data <- data_add_pca %>%
  pivot_longer(
    cols = all_of(data_add_col_pca),
    names_to = "PC",
    values_to = "Value"
  ) %>%
  mutate(PC_ = factor(PC, levels = data_add_col_pca))

# Density-scaled histogram with a kernel density overlay, one facet per
# component. after_stat(density) replaces the deprecated `..density..`
# notation; bins = 30 is the ggplot2 default, stated explicitly.
ggplot(data = data, aes(x = Value)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    colour = "black",
    fill = "white"
  ) +
  geom_density(alpha = 0.2, fill = "blue") +
  facet_wrap(vars(PC_))

# Horizontal boxplot per component. A single layer carries the outlier styling:
# stacking geom_boxplot() and stat_boxplot() drew every box twice and left the
# default black outliers underneath the red, transparent ones.
ggplot(data = data, aes(x = Value)) +
  geom_boxplot(coef = 1.5, outlier.colour = "red", outlier.alpha = 0.1) +
  facet_wrap(vars(PC_)) +
  # The y axis carries no information for a single horizontal box.
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )